---
title: "final_estimations"
author: "Matthew Gordon"
date: "2/10/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
rootdir <- dirname(getwd())
```

Load packages and data - create functions

```{r, message=FALSE}
library(tidyverse)
library(estimatr)
library(ggstance)
library(boot)
library(fixest)
library(grid)
library(gridExtra)
library(cowplot)

df.master <- read_csv(paste(rootdir, "/Data/master.csv", sep = ""), 
                      na=c("",".","NA"))

df.str_race <- read_csv(paste(rootdir, "/Data/race_stratified.csv", sep = ""), 
                      na=c("",".","NA"))

df.str_age <- read_csv(paste(rootdir, "/Data/age_stratified.csv", sep = ""), 
                      na=c("",".","NA"))

df.str <- read_csv(paste(rootdir, "/Data/master_stratified.csv", sep = ""), 
                      na=c("",".","NA"))

indvars <- c("pm_idw", "no_idw", "no2_idw")
rests <- 500
geogs <- c("all", "OB")
depvars <- c("cumulative_total_deaths", "cumulative_hospitalized")
set.seed(13)
```

## Define functions

```{r}
poisregc <- function(depvar, ind_var, ivs_raw, data, indices){
        dfi <- data[indices, ]
        npstg1 <- lm_robust(as.formula(paste(ind_var, "~", ivs_raw, "+ near_dist + as.factor(puma) + as.factor(station)")),
                            data = dfi, se_type = "stata")
        dfi$stg1e <- as.vector(unlist(dfi[, ind_var])) - as.vector(npstg1$fitted.values)
        poisreg <- fepois(as.formula(paste(depvar, "~", ind_var, "+ stg1e + near_dist + as.factor(station) | as.factor(puma)")),
                            data = dfi, warn = FALSE, notes = FALSE)
        return(as.numeric(poisreg$coefficients[1]))
}

dy.dx.ame <- function(formula, depvar, data, indices){
        dfi <- data[indices, ]
        naiveregfep <- iv_robust(formula,
                data = dfi, se_type = "HC0", diagnostics = TRUE)
        b1 <- naiveregfep$coefficients[2]
        yhats <- naiveregfep$fitted.values
        ehats <- log(as.vector(unlist(dfi[, depvar])) + 1) - yhats 
        dy.dx <- b1*exp(yhats)*mean(exp(ehats))
        return(mean(dy.dx))
}

poismfx <- function(depvar, ind_var, ivs_raw, data, indices){
        dfi <- data[indices, ]
        npstg1 <- lm_robust(as.formula(paste(ind_var, "~", ivs_raw, "+ near_dist + as.factor(puma) + as.factor(station)")),
                            data = dfi, se_type = "stata")
        dfi$stg1e <- as.vector(unlist(dfi[, ind_var])) - as.vector(npstg1$fitted.values)
        poisreg <- fepois(as.formula(paste(depvar, "~", ind_var, "+ stg1e + near_dist + as.factor(station) | as.factor(puma)")),
                            data = dfi, warn = FALSE, notes = FALSE)
        return(as.numeric(poisreg$coefficients[1])*mean(poisreg$fitted.values))
}

final_estimation <- function(depvar, ind_var, geography, restriction){
        if (geography=="all") {
                df <- df.master
        } else if (geography=="Man") {
                df <- filter(df.master, below_110th==1)
        } else {
                df <- filter(df.master, below_110th==0)
        }
        
        if (restriction==0) {
                df <- df
                ivs_raw <- "downwind_5_r + downwind_3_r + downwind_2_r + 
                                          downwind_1_r + downwind_0_r"
        } else {
                df <- filter(df, near_dist < restriction)
                if (restriction == 2000) {
                        ivs_raw <- "downwind_2_r + downwind_1_r + downwind_0_r"
                } else if (restriction == 1000) {
                        ivs_raw <- "downwind_1_r + downwind_0_r"
                } else {
                        ivs_raw <- "downwind_0_r"
                }
        }
        
        df.i <- select(df, downwind_5_r, downwind_3_r, downwind_2_r,
                          downwind_1_r, downwind_0_r, 
                          pm_idw, no_idw, no2_idw, pm_closest, no_closest, no2_closest)
        df <- df[complete.cases(df.i),]
        
        controls <- "+ tot_population + white + black + hispanic + tot_age65_74 + tot_age75up + median_tenure + tot_ed_none + tot_ed_hs +
                        tot_inc_percap + tot_service_occupation + tot_uninsured + tot_public_transport + tot_institutionalized"

        ### OLS puma
        
        olsregp <- lm_robust(as.formula(paste("log(", depvar, " + 1) ~", ind_var,
                                              "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, subset = tot_population > 500 & near_dist > 50, se_type = "stata")
        
        ### naive iv - puma
        
        formula <- as.formula(paste("log(", depvar, " + 1) ~", ind_var, "+ near_dist + as.factor(puma) + as.factor(station) |",
                                ivs_raw, "+ near_dist + as.factor(puma) + as.factor(station)"))
        
        reg1 <- iv_robust(formula,
                        data = df, subset = tot_population > 500, se_type = "HC1")
        
        reg2 <- iv_robust(formula,
                        data = df, subset = tot_population > 500 & near_dist > 50, se_type = "HC1")
        
        reg3 <- iv_robust(formula,
                        data = df, subset = tot_population > 500 & intersect_ctr==0, se_type = "HC1")
        
        return(list(reg1, reg2, reg3))
}
```

### Estimation - Final tables

```{r, results="asis", message = FALSE}

df.coeffs <- data.frame("geo" = rep(NA, 24),
                        "restriction" = rep(NA, 24),
                        "pollutant" = rep(NA, 24),
                        "outcome" = rep(NA, 24),
                        "model" = rep(NA, 24),
                        "coefficient" = rep(NA, 24),
                        "cih" = rep(NA, 24),
                        "cil" = rep(NA, 24))
q <- 1

for(k in depvars) {
        for (i in indvars) {
                for (r in rests) {
                       for (j in geogs) {
                                f1 <- final_estimation(depvar = k, ind_var = i, geography = j, restriction = r)
                                
                                df.coeffs$geo[q] <- j
                                df.coeffs$restriction[q] <- r
                                df.coeffs$pollutant[q] <- i
                                df.coeffs$outcome[q] <- k
                                df.coeffs$model[q] <- "Within 500m of Nearest Highway" 
                                df.coeffs$coefficient[q] <- f1[[1]]$coefficients[2]
                                df.coeffs$cil[q] <- summary(f1[[1]])$coefficients[2,5]
                                df.coeffs$cih[q] <- summary(f1[[1]])$coefficients[2,6]
                                
                                df.coeffs$geo[q+1] <- j
                                df.coeffs$restriction[q+1] <- r
                                df.coeffs$pollutant[q+1] <- i
                                df.coeffs$outcome[q+1] <- k
                                df.coeffs$model[q+1] <- "Between 50m and 500m of Nearest Highway" 
                                df.coeffs$coefficient[q+1] <- f1[[2]]$coefficients[2]
                                df.coeffs$cil[q+1] <- summary(f1[[2]])$coefficients[2,5]
                                df.coeffs$cih[q+1] <- summary(f1[[2]])$coefficients[2,6]
                                
                                q <- q+2
                       }
                }
        }
}

df.coeffs$geo <- factor(df.coeffs$geo, levels = c("OB", "all"))
df.coeffs$pollutant <- factor(df.coeffs$pollutant, levels = c("pm_idw", "no2_idw", "no_idw"))
df.coeffs$model <- factor(df.coeffs$model, levels = c("Within 500m of Nearest Highway", "Between 50m and 500m of Nearest Highway"))
df.coeffs$outcome <- factor(df.coeffs$outcome, levels = c("cumulative_total_deaths", "cumulative_hospitalized"))

```

### Final figures

```{r}
labels <- c(`cumulative_total_deaths` = "Log Deaths",
            `cumulative_hospitalized` = "Log Hospitalizations",
            `pm_idw` = "PM 2.5",
            `no_idw` = "NO",
            `no2_idw` = "NO2")

mainmods1 <- ggplot(data = df.coeffs, 
       aes(x = coefficient, y = model, color = geo, shape = geo)) +
        geom_vline(aes(xintercept=0)) +
        geom_point(position=position_dodgev(height=0.5), size = 3) +
        geom_pointrange(aes(xmin = cil, xmax = cih),
                        position=position_dodgev(height=0.5)) +
        labs(x = "Effect of a one-unit increase", y = "", color = "Geography", shape = "Geography") +
        scale_color_discrete(labels = c("Outer Boroughs", "Citywide")) +
        scale_shape_discrete(labels = c("Outer Boroughs", "Citywide")) +
        guides(color = guide_legend(reverse = TRUE), shape = guide_legend(reverse = TRUE)) +
        facet_grid(outcome~pollutant, labeller = as_labeller(labels)) +
        theme_light() + 
        theme(text = element_text(size=18), strip.text = element_text(size=20), panel.spacing.x = unit(2, "lines"))

mainmods1

mainmods2 <- ggplot(data = filter(df.coeffs, model == "Between 50m and 500m of Nearest Highway"), 
       aes(x = coefficient, y = geo, color = geo, shape = geo)) +
        geom_vline(aes(xintercept=0)) +
        geom_point(position=position_dodgev(height=0.5), size = 5) +
        geom_pointrange(aes(xmin = cil, xmax = cih),
                        position=position_dodgev(height=0.5)) +
        labs(x = "Causal Effect of One-Unit Increase in Pollutant Concentration", y = "", color = "Sample", shape = "Sample") +
        scale_y_discrete(labels = c("", "")) +
        scale_color_discrete(labels = c("Outer Boroughs", "Citywide")) +
        scale_shape_discrete(labels = c("Outer Boroughs", "Citywide")) +
        guides(color = guide_legend(reverse = TRUE), shape = guide_legend(reverse = TRUE)) +
        facet_grid(outcome~pollutant, labeller = as_labeller(labels)) +
        theme_light() + 
        theme(text = element_text(size=18), strip.text = element_text(size=20), panel.spacing.x = unit(2, "lines"))

mainmods2
```

Compare rate based, count based, OLS and IV regressions

```{r}
model_comps <- function(ind_var, depvar, df, ivs_raw, controls){
        dr = if(depvar == "hosps") "hosp_rate" else "death_rate"
        dc = if(depvar == "hosps") "cumulative_hospitalized" else "cumulative_total_deaths"
        ar = if(depvar == "hosps") "adjhosprate" else "adjdeathrate"

        ### OLS - rate 
        
        olsregp <- lm_robust(as.formula(paste(dr, "~", ind_var,
                                              "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, se_type = "stata")
        
        ### OLS - rate + controls
        
        olsregrc <- lm_robust(as.formula(paste(dr, "~", ind_var, controls,
                                              "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, se_type = "stata")
        
        ### OLS - rate weighted
        
        olsregpw <- lm_robust(as.formula(paste(dr, "~", ind_var, controls,
                                              "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, weights = tot_population, se_type = "stata")
        
        ### OLS - adj rate weighted
        
        olsregpa <- lm_robust(as.formula(paste(ar, "~", ind_var, controls,
                                              "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, weights = adj_population, se_type = "stata")
        
        ### IV count
        
        naiveregfep <- iv_robust(as.formula(paste("log(", dc, " + 1) ~", ind_var, "+ near_dist + as.factor(puma) + as.factor(station) |",
                                ivs_raw, "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, se_type = "HC1")
        
        ### iv count - controls
        
        naiveregfepc <- iv_robust(as.formula(paste("log(", dc, " + 1) ~", ind_var, controls,
                                                  "+ near_dist + as.factor(puma) + as.factor(station) |",
                                ivs_raw, controls, "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, se_type = "HC1", diagnostics = TRUE)
        
        ### IV rate
        
        naiveregfepr <- iv_robust(as.formula(paste(dr, "~", ind_var, "+ near_dist + as.factor(puma) + as.factor(station) |",
                                ivs_raw, "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, se_type = "HC1")
        
        ### iv rate - controls
        
        naiveregfepcr <- iv_robust(as.formula(paste(dr, "~", ind_var, controls,
                                                  "+ near_dist + as.factor(puma) + as.factor(station) |",
                                ivs_raw, controls, "+ near_dist + as.factor(puma) + as.factor(station)")),
                        data = df, se_type = "HC1", diagnostics = TRUE)
        
        return(list(olsregp, olsregrc, olsregpw, olsregpa, naiveregfep, naiveregfepc, naiveregfepr, naiveregfepcr))
}


df1 <- filter(df.master, near_dist < 500, near_dist > 50, tot_population > 500) %>%
  mutate(adj_population = tot_population*(1+device_chg1020),
         adjdeathrate = cumulative_total_deaths*100000/adj_population,
         adjhosprate = cumulative_hospitalized*100000/adj_population)

ivs_raw <- "downwind_0_r"
controls <- " + log(tot_population) + log(tot_inc_percap) + tot_owner_occupied + tot_public_assistance + tot_rent2income50percent +
                    tot_ed_hs + tot_ed_none + black + hispanic + tot_age18_44 + tot_age45_54 + tot_age55_64 + tot_age65_74 + tot_age75up + 
                    device_chg"
models <- c("(1) OLS", "(2) OLS + controls", "(3) OLS pop wts", "(4) OLS adj rate", "(5) IV", 
            "(6) IV + controls", "(7) IV rate", "(8) IV rate + controls")
t = "NO"

f1 <- model_comps("no_idw", "deaths", df1, ivs_raw, controls)
f1h <- model_comps("no_idw", "hosps", df1, ivs_raw, controls)
        
df.comps <- rbind(as.data.frame(do.call(rbind, lapply(f1, function (x) summary(x)$coefficient[2,1:2]))),
                  as.data.frame(do.call(rbind, lapply(f1h, function (x) summary(x)$coefficient[2,1:2]))))
df.comps$model <- factor(rep(models, 2), levels = models)
df.comps$depvar <- c(c(rep("death_rate", 4), rep("deaths", 2), rep("death_rate", 2)), 
                     c(rep("hosp_rate", 4), rep("hospitalizations", 2), rep("hosp_rate", 2)))

labels <- c(`deaths` = "Deaths",
            `hospitalizations` = "Hospitalizations",
            `death_rate` = "Death Rate per 100k",
            `hosp_rate` = "Hospitalization Rate per 100k")
                     
g1 <- ggplot(filter(df.comps, 
                    !(model %in% c("(7) IV rate", "(8) IV rate + controls")) & depvar %in% c("deaths", "hospitalizations")), 
       aes(x = model, y = Estimate, color = model)) +
        geom_hline(aes(yintercept = 0)) + 
        geom_point() +
        geom_pointrange(aes(ymin = Estimate - 1.96*`Std. Error`, ymax = Estimate + 1.96*`Std. Error`)) +
        facet_wrap(~depvar, labeller = as_labeller(labels), ncol = 1) +
        theme_light() + theme(axis.text.x = element_text(angle = 22.5, hjust = .8, vjust = .7), 
                              legend.position = "none", text = element_text(size = 20)) +
        labs(y = "", x = "", title = "") + 
        scale_y_continuous(limits = c(-.75, .75), breaks = seq(-.75, .75, .3))

g2 <- ggplot(filter(df.comps, !(model %in% c("(7) IV rate", "(8) IV rate + controls")) & depvar %in% c("death_rate", "hosp_rate")), 
       aes(x = model, y = Estimate, color = model)) +
        geom_hline(aes(yintercept = 0)) + 
        geom_point() +
        geom_pointrange(aes(ymin = Estimate - 1.96*`Std. Error`, ymax = Estimate + 1.96*`Std. Error`)) +
        facet_wrap(~depvar, labeller = as_labeller(labels), ncol = 1) +
        theme_light() + theme(axis.text.x = element_text(angle = 22.5, hjust = .8, vjust = .9), 
                              legend.position = "none", text = element_text(size = 20)) +
        labs(y = "Coefficient Estimate", x = "", title = t) + 
        scale_y_continuous(limits = c(-30, 30), breaks = seq(-30, 30, 10))

plot_grid(g2, g1, rel_widths = c(5,4))
```

```{r}
library(modelsummary)
modelsummary(f1, coef_omit = "puma")
```
